Preparations

Load the necessary libraries

library(car)       #for regression diagnostics
library(broom)     #for tidy output
library(ggfortify) #for model diagnostics
library(sjPlot)    #for outputs
library(knitr)     #for kable
library(effects)   #for partial effects plots
library(emmeans)   #for estimating marginal means
library(ggeffects) #for plotting marginal means
library(MASS)      #for glm.nb
library(MuMIn)     #for AICc
library(tidyverse) #for data wrangling
library(modelr)    #for auxillary modelling functions
library(performance) #for residuals diagnostics
library(see)         #for plotting residuals
library(DHARMa)    #for residual diagnostics plots
library(patchwork) #grid of plots
library(scales)    #for more scales
theme_set(theme_classic())

Scenario

Here is a modified example from Quinn and Keough (2002). Day and Quinn (1989) described an experiment that examined how rock surface type affected the recruitment of barnacles to a rocky shore. The experiment had a single factor, surface type, with 4 treatments or levels: algal species 1 (ALG1), algal species 2 (ALG2), naturally bare surfaces (NB) and artificially scraped bare surfaces (S). There were 5 replicate plots for each surface type and the response (dependent) variable was the number of newly recruited barnacles on each plot after 4 weeks.

Six-plated barnacle

Format of day.csv data files

treat barnacle
ALG1 27
.. ..
ALG2 24
.. ..
NB 9
.. ..
S 12
.. ..
treat Categorical listing of surface types. ALG1 = algal species 1, ALG2 = algal species 2, NB = naturally bare surface, S = scraped bare surface.
barnacle The number of newly recruited barnacles on each plot after 4 weeks.

Read in the data

As we are going to treat Treatment as a categorical predictor, we will specifically declare it as such straight after importing the data.

day = read_csv('../data/day.csv', trim_ws=TRUE)
glimpse(day)
## Rows: 20
## Columns: 2
## $ TREAT    <chr> "ALG1", "ALG1", "ALG1", "ALG1", "ALG1", "ALG2", "ALG2", "ALG…
## $ BARNACLE <dbl> 27, 19, 18, 23, 25, 24, 33, 27, 26, 32, 9, 13, 17, 14, 22, 1…
day <- day %>% janitor::clean_names() %>%
  mutate(treat = fct_relevel(treat, c("NB", "S", "ALG1", "ALG2")))

Exploratory data analysis

Model formula: \[ y_i \sim{} \mathcal{Pois}(\lambda_i)\\ \mu_i = \boldsymbol{\beta} \bf{X_i} \]

where \(\boldsymbol{\beta}\) is a vector of effects parameters and \(\bf{X}\) is a model matrix representing the intercept and treatment contrasts for the effects of Treatment on barnacle recruitment.

ggplot(day, aes(y=barnacle, x=treat)) +
    geom_boxplot()+
    geom_point(color='red')

ggplot(day, aes(y=barnacle, x=treat)) +
    geom_violin()+
    geom_point(color='red')

Fit the model

day_mod <- glm(barnacle ~ treat, data = day, family = poisson(link = "log"))

Model validation

autoplot(day_mod, which = 1:6)

DHARMa::simulateResiduals(day_mod, plot=T)

## Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
##  
## Scaled residual values: 0.8519155 0.2442125 0.1780693 0.5192783 0.7018186 0.2069486 0.845717 0.4237525 0.3734334 0.797562 0.05894113 0.3008211 0.7262854 0.3287856 0.9518321 0.4402874 0.06980402 0.6823056 0.9620096 0.2676171 ...

All looks good!

Cook’s d is not needed if you only have categorical predictors

Partial plots

plot_model(day_mod, type = 'eff')
## $treat

Model investigation / hypothesis testing

summary(day_mod)
## 
## Call:
## glm(formula = barnacle ~ treat, family = poisson(link = "log"), 
##     data = day)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6748  -0.6522  -0.2630   0.5699   1.7380  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.7081     0.1155  23.452  < 2e-16 ***
## treatS       -0.1278     0.1688  -0.757   0.4488    
## treatALG1     0.4010     0.1492   2.688   0.0072 ** 
## treatALG2     0.6383     0.1427   4.472 7.75e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 54.123  on 19  degrees of freedom
## Residual deviance: 17.214  on 16  degrees of freedom
## AIC: 120.34
## 
## Number of Fisher Scoring iterations: 4
exp(2.7081)
## [1] 15.00075
exp(2.7081 + -0.1278)
## [1] 13.2011
exp(2.7081 + 0.4010)
## [1] 22.40087
exp(2.7081 + 0.6383)
## [1] 28.40031

exp(2.7081) = 15.0 is the number of barnacles in the NB treatment 13.2 is the number of barnacles in the S treatment (also exp(-0.1278) is the factor or fractional difference) 22.4 is the number of barnacles in the ALG1 treatment 28.4 is the number of barnacles in the ALG2 treatment

tidy(day_mod, confint = T)
tidy(day_mod, confint = T, exponentiate = T)
# fancy table:
# sjPlot::tab_model(day_mod, show.se = T, show.aic = T)

Predictions

day_mod %>% emmeans(~treat, type = 'response') %>% plot(add.data=T)

Post-hoc test (Tukey’s)

emmeans(day_mod, pairwise~treat)
## $emmeans
##  treat emmean     SE  df asymp.LCL asymp.UCL
##  NB      2.71 0.1155 Inf      2.48      2.93
##  S       2.58 0.1231 Inf      2.34      2.82
##  ALG1    3.11 0.0945 Inf      2.92      3.29
##  ALG2    3.35 0.0839 Inf      3.18      3.51
## 
## Results are given on the log (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast    estimate    SE  df z.ratio p.value
##  NB - S         0.128 0.169 Inf  0.757  0.8735 
##  NB - ALG1     -0.401 0.149 Inf -2.688  0.0362 
##  NB - ALG2     -0.638 0.143 Inf -4.472  <.0001 
##  S - ALG1      -0.529 0.155 Inf -3.408  0.0037 
##  S - ALG2      -0.766 0.149 Inf -5.143  <.0001 
##  ALG1 - ALG2   -0.237 0.126 Inf -1.878  0.2375 
## 
## Results are given on the log (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 4 estimates
emmeans(day_mod, pairwise~treat, type = 'response')
## $emmeans
##  treat rate   SE  df asymp.LCL asymp.UCL
##  NB    15.0 1.73 Inf      12.0      18.8
##  S     13.2 1.62 Inf      10.4      16.8
##  ALG1  22.4 2.12 Inf      18.6      27.0
##  ALG2  28.4 2.38 Inf      24.1      33.5
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale 
## 
## $contrasts
##  contrast    ratio     SE  df z.ratio p.value
##  NB / S      1.136 0.1918 Inf  0.757  0.8735 
##  NB / ALG1   0.670 0.0999 Inf -2.688  0.0362 
##  NB / ALG2   0.528 0.0754 Inf -4.472  <.0001 
##  S / ALG1    0.589 0.0914 Inf -3.408  0.0037 
##  S / ALG2    0.465 0.0692 Inf -5.143  <.0001 
##  ALG1 / ALG2 0.789 0.0997 Inf -1.878  0.2375 
## 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## Tests are performed on the log scale
# emmeans(day_mod, ~treat) %>% pairs() %>% confint() # similar to second table
1/0.465 # ALG2 / S
## [1] 2.150538

ALG2 is 2.15 times more than S

emmeans(day_mod, ~ treat) %>% # get differences
  regrid() %>%  # does backtransform of the differences BEFORE pair-wise comparisons
  pairs %>% # get pair-wise absolute values of differences
  confint() # get confidence interval limits for these differences

ALG2 has 13.4 more units of barnacles than NB ALG1 has 7.4 more units of barnacles than NB ALG2 has 15.2 more units of barnacles than S ALG1 has 9.2 more units of barnacles than S

Planned contrasts

emmeans(day_mod, ~ treat) %>%
  regrid() %>%
  pairs() %>%
  confint()

Define your own

Compare:

  1. ALG1 vs ALG2
  2. NB vs S
  3. average of ALG1+ALG2 vs NB+S

Two rules to planned contrasts:

  1. Can only use # groups - 1 planned comparisons
  2. Each of these comparisons must be independent of one another
cmat <- cbind('Alg1_Alg2' = c(1, -1, 0, 0),
              'NB_S' = c(0, 0, 1, -1),
              'Alg_Bare' = c(0.5, 0.5, -0.5, -0.5))

Need to check that they’re all independent

crossprod(cmat) #each column against each other column
##           Alg1_Alg2 NB_S Alg_Bare
## Alg1_Alg2         2    0        0
## NB_S              0    2        0
## Alg_Bare          0    0        1
day_mod %>%
  emmeans(~treat, contr = list(treat = cmat), type = 'response') #'response' to transform
## $emmeans
##  treat rate   SE  df asymp.LCL asymp.UCL
##  NB    15.0 1.73 Inf      12.0      18.8
##  S     13.2 1.62 Inf      10.4      16.8
##  ALG1  22.4 2.12 Inf      18.6      27.0
##  ALG2  28.4 2.38 Inf      24.1      33.5
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the log scale 
## 
## $contrasts
##  contrast        ratio     SE  df z.ratio p.value
##  treat.Alg1_Alg2 1.136 0.1918 Inf  0.757  0.4488 
##  treat.NB_S      0.789 0.0997 Inf -1.878  0.0604 
##  treat.Alg_Bare  0.558 0.0588 Inf -5.536  <.0001 
## 
## Tests are performed on the log scale
day_mod %>%
  emmeans(~treat) %>%
  regrid() %>%
  contrast(list(TREAT = cmat))
##  contrast        estimate   SE  df z.ratio p.value
##  TREAT.Alg1_Alg2      1.8 2.37 Inf  0.758  0.4485 
##  TREAT.NB_S          -6.0 3.19 Inf -1.882  0.0598 
##  TREAT.Alg_Bare     -11.3 1.99 Inf -5.686  <.0001

Summary figures

newdata <- emmeans(day_mod, ~treat, type = 'response') %>%
  as.data.frame() %>% 
  rename(barnacle = rate, lwr = asymp.LCL, upr = asymp.UCL)

ggplot(newdata, aes(y = barnacle, x = treat)) +
  geom_pointrange(aes(ymin = lwr, ymax = upr)) +
  geom_jitter(data = day, col="red", height = 0, width=0.1)

newdata_planned <- day_mod %>% 
  emmeans(~treat) %>% 
  regrid() %>%
  contrast(list(treat = cmat)) %>%
  confint() %>%
  rename(lwr = asymp.LCL, upr = asymp.UCL)

ggplot(data = newdata_planned, aes(y = estimate, x = contrast)) + 
  geom_hline(yintercept = 0, linetype = 'dashed') + 
  geom_pointrange(aes(ymin = lwr, ymax = upr)) + 
  theme_classic() +
  coord_flip()

Ways of doing multiple plots with inset ‘a’ and ’b’s:

# 
# (g1 + ggtitle('a)')) + (g2 + ggtitle('b)'))
#OR
# g1 + annotate(geom = 'text', y = Inf, x = -Inf, label = 'a)', 
#               hjust = -0.5, vjust = 1)

References

Quinn, G. P., and K. J. Keough. 2002. Experimental Design and Data Analysis for Biologists. London: Cambridge University Press.